import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import jieba
from collections import Counter
from scipy.sparse import coo_matrix

# ========== 1. 数据预处理 ==========
corpus = [
    "我们 喜欢 深度 学习",
    "自然 语言 处理 是 有趣 的",
    "人工智能 改变 了 世界",
    "深度 学习 是 人工智能 的 重要 组成部分"
]

# 分词
tokenized_corpus = [list(jieba.cut(sentence)) for sentence in corpus]
vocab = set(word for sentence in tokenized_corpus for word in sentence)
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

# 计算共现矩阵
window_size = 2
co_occurrence = Counter()

for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    for center_idx in range(len(indices)):
        center_word = indices[center_idx]
        for offset in range(-window_size, window_size + 1):
            context_idx = center_idx + offset
            if 0 <= context_idx < len(indices) and context_idx != center_idx:
                context_word = indices[context_idx]
                co_occurrence[(center_word, context_word)] += 1

# 转换为稀疏矩阵
rows, cols, values = zip(*[(c[0], c[1], v) for c, v in co_occurrence.items()])
X = coo_matrix((values, (rows, cols)), shape=(len(vocab), len(vocab)))


# ========== 2. 定义 GloVe 模型 ==========
class GloVe(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(GloVe, self).__init__()
        self.w_embeddings = nn.Embedding(vocab_size, embedding_dim)  # 中心词嵌入
        self.c_embeddings = nn.Embedding(vocab_size, embedding_dim)  # 上下文词嵌入
        self.w_bias = nn.Embedding(vocab_size, 1)  # 中心词偏置
        self.c_bias = nn.Embedding(vocab_size, 1)  # 上下文词偏置
        nn.init.xavier_uniform_(self.w_embeddings.weight)
        nn.init.xavier_uniform_(self.c_embeddings.weight)

    def forward(self, center, context, co_occur):
        w_emb = self.w_embeddings(center)
        c_emb = self.c_embeddings(context)
        w_bias = self.w_bias(center).squeeze()
        c_bias = self.c_bias(context).squeeze()
        dot_product = (w_emb * c_emb).sum(dim=1)
        loss = (dot_product + w_bias + c_bias - torch.log(co_occur + 1e-8)) ** 2
        return loss.mean()


# 初始化模型
embedding_dim = 10
model = GloVe(len(vocab), embedding_dim)

# ========== 3. 训练 GloVe ==========
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
num_epochs = 100

# 转换数据
co_occurrence_tensor = torch.tensor(X.data, dtype=torch.float)
pairs = list(zip(X.row, X.col, co_occurrence_tensor))

for epoch in range(num_epochs):
    total_loss = 0
    np.random.shuffle(pairs)
    for center, context, co_occur in pairs:
        optimizer.zero_grad()
        loss = model(
            torch.tensor([center], dtype=torch.long),
            torch.tensor([context], dtype=torch.long),
            torch.tensor([co_occur], dtype=torch.float)  # 修正数据类型
        )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss:.4f}")

# ========== 4. 获取词向量 ==========
word_vectors = model.w_embeddings.weight.data.numpy()


# ========== 5. 计算相似度 ==========
def most_similar(word, top_n=3):
    if word not in word2idx:
        return "单词不在词汇表中"

    word_vec = word_vectors[word2idx[word]].reshape(1, -1)
    similarities = np.dot(word_vectors, word_vec.T).squeeze()
    similar_idx = similarities.argsort()[::-1][1:top_n + 1]
    return [(idx2word[idx], similarities[idx]) for idx in similar_idx]


# 测试
test_words = ["深度", "学习", "人工智能"]
for word in test_words:
    print(f"【{word}】的相似单词:", most_similar(word))
